package cn.stronglink.crawler.service.handle.nrcc;

import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.alibaba.fastjson.JSON;

import cn.stronglink.crawler.common.Hanyu;
import cn.stronglink.crawler.common.HtmlPage;
import cn.stronglink.crawler.common.RequestAndResponseTool;
import cn.stronglink.crawler.service.ICrawlerHandle;
import cn.stronglink.crawler.service.handle.nrcc.vo.ChemicalInfoVo;
import cn.stronglink.crawler.service.handle.nrcc.vo.ChemicalInfoVoRepository;
import cn.stronglink.crawler.vo.ChemicalLogVo;
import cn.stronglink.crawler.vo.ChemicalLogVoRepository;

/**
 * Crawls hazardous-chemical data from the NRCC site
 * (http://service.nrcc.com.cn) and persists it to the local database.
 *
 * @author yuzhantao @Scope("prototype")
 */
@Service
public class NrccHandle implements ICrawlerHandle {
	private final static Logger logger = LogManager.getLogger(NrccHandle.class);
	private final static String HOST = "http://service.nrcc.com.cn"; // server address
	private final static String SITE = HOST + "/Mobile/MyQuickWatch"; // list page used as the crawl entry point
	private final static String QUERY_KEY_SPAN = "|"; // separator between the parts of the query key
	private final static Hanyu hanyu = new Hanyu(); // Chinese -> pinyin converter
	// Compiled once instead of on every call. Uses "[0-9]+" (not "[0-9]*"):
	// "*" also matches the empty string, so an empty link text would pass the
	// match and crash Integer.parseInt("") with a NumberFormatException.
	private final static Pattern PAGE_NUM_PATTERN = Pattern.compile("[0-9]+");
	private final static int DETAIL_RETRY_COUNT = 10; // max download attempts per detail page
	private final static int LIST_RETRY_COUNT = 5; // max download attempts per list page
	private ExecutorService chemicalThreadPool = Executors.newFixedThreadPool(20); // pool running the detail-page downloads
	// Count of successfully downloaded records. Previously a boxed Long guarded
	// by synchronized(downloadCount): since ++downloadCount re-assigns the field
	// to a NEW Long object, concurrent threads locked different monitors and
	// increments could be lost. AtomicLong makes the increment safe and lock-free.
	private final AtomicLong downloadCount = new AtomicLong();
	@Autowired
	private ChemicalInfoVoRepository chemicalInfoVoRepository; // repository for data crawled from the NRCC site
	@Autowired
	private ChemicalLogVoRepository chemicalLogVoRepository; // repository for hazardous-chemical crawl logs

	@Override
	public boolean isHandle(String site) {
		// Constant-first comparison is null-safe: a null site is simply not handled.
		return SITE.equals(site);
	}

	/**
	 * Crawls every list page of the site: reads the first page to learn the
	 * total page count, wipes previously stored data, then downloads each page.
	 *
	 * @param site entry-point URL of the NRCC list pages
	 * @throws Exception if the first page cannot be fetched
	 */
	@Override
	public void handle(String site) throws Exception {
		HtmlPage firstPage = RequestAndResponseTool.sendRequstAndGetResponse(site); // fetch the first list page
		int maxPageNum = this.getMaxPageNum(firstPage); // total page count taken from its pagination bar
		chemicalInfoVoRepository.deleteAll(); // full refresh: drop all previously crawled rows
		logger.info("获取页面最大页数:{}", maxPageNum);
		for (int i = 1; i <= maxPageNum; i++) {
			String listUrl = site + "?SearchItem=&&SearchValue=&&page=" + i;
			logger.info("抓取化学品列表网址:{}", listUrl);
			this.downloadPageToDBFromListPage(listUrl);
		}
	}

	/**
	 * Fetches one list page, extracts every chemical detail-page link and
	 * schedules each download on the thread pool. Fetching the list page is
	 * retried up to {@value #LIST_RETRY_COUNT} times on failure.
	 *
	 * @param url list-page URL
	 */
	private void downloadPageToDBFromListPage(String url) {
		for (int attempt = 1; attempt <= LIST_RETRY_COUNT; attempt++) {
			try {
				HtmlPage page = RequestAndResponseTool.sendRequstAndGetResponse(url);
				Elements aElements = page.getDoc().select("a[class=textblue]");
				// Schedule one asynchronous download per chemical detail page.
				for (Element a : aElements) {
					final String detailUrl = HOST + a.attr("href"); // URL of the detail page to crawl
					chemicalThreadPool.execute(new Runnable() {

						@Override
						public void run() {
							try {
								downloadPageToDB(detailUrl); // download the page and persist it
								logger.info("已成功下载[{}]条数据", downloadCount.incrementAndGet());
							} catch (Exception e) {
								saveExceptionLogToDB(detailUrl, e.getMessage()); // record the failure in the log table
								logger.error("下载详情页失败:{}", detailUrl, e);
							}
						}
					});
				}
				return; // list page handled successfully - stop retrying
			} catch (Exception e) {
				logger.error("抓取列表页失败:{}", url, e);
			}
		}
	}

	/**
	 * Downloads one chemical detail page, parses it and saves it to the
	 * database. Retries up to {@value #DETAIL_RETRY_COUNT} times; if every
	 * attempt fails the failure is propagated to the caller.
	 *
	 * @param url detail-page URL to crawl
	 * @throws Exception when all {@value #DETAIL_RETRY_COUNT} attempts fail
	 */
	private void downloadPageToDB(String url) throws Exception {
		for (int attempt = 1; attempt <= DETAIL_RETRY_COUNT; attempt++) {
			try {
				logger.debug("抓取化学品详情网址:{}", url);
				HtmlPage detailPage = RequestAndResponseTool.sendRequstAndGetResponse(url); // fetch the page
				ChemicalInfoVo ci = parseData(detailPage); // map the HTML fields onto the entity
				String pyChemicalCNName = hanyu.getStringPinYin(ci.getChemicalCNName()); // pinyin of the Chinese name
				String pyChemicalOtherCNName = hanyu.getStringPinYin(ci.getOtherCNName()); // pinyin of the Chinese alias
				// The query key concatenates pinyin, Chinese and English names so
				// the chemical can later be looked up by any of them.
				ci.setQueryKey(pyChemicalCNName + QUERY_KEY_SPAN + ci.getChemicalCNName() + QUERY_KEY_SPAN
						+ pyChemicalOtherCNName + QUERY_KEY_SPAN + ci.getOtherCNName() + QUERY_KEY_SPAN
						+ ci.getChemicalENName() + QUERY_KEY_SPAN + ci.getOtherENName());
				chemicalInfoVoRepository.save(ci); // persist the parsed record
				logger.info("获取到化学品数据完成 JSON={}", JSON.toJSONString(ci));
				return;
			} catch (Exception e) {
				logger.error("下载详情页第{}次尝试失败:{}", attempt, url, e);
			}
		}
		// The old message hard-coded "5" while the loop retried 10+ times;
		// deriving it from the constant keeps the two in sync.
		throw new Exception("超过" + DETAIL_RETRY_COUNT + "次下载页面数据失败,网址:" + url);
	}

	/**
	 * Persists a crawl failure to the log table. Logging failures themselves
	 * are swallowed (only logged) so they never mask the original error.
	 *
	 * @param url URL whose download failed
	 * @param msg failure message to record
	 */
	private void saveExceptionLogToDB(String url, String msg) {
		try {
			ChemicalLogVo chemicalLogVo = new ChemicalLogVo();
			chemicalLogVo.setSite(url);
			chemicalLogVo.setMessge(msg);
			chemicalLogVo.setCreateTime(new Date());
			chemicalLogVoRepository.save(chemicalLogVo);
		} catch (Exception logException) {
			logger.error("保存异常日志失败:{}", url, logException);
		}
	}

	/**
	 * Parses the detail-page HTML into a {@link ChemicalInfoVo}. Each
	 * {@code td.formname} cell holds a field label; the sibling cell holds its
	 * value, keyed by the label's {@code for} attribute.
	 *
	 * @param page detail page to parse
	 * @return populated entity with its creation time set to now
	 * @throws IllegalAccessException   propagated from the reflective setValue
	 * @throws IllegalArgumentException propagated from the reflective setValue
	 */
	private ChemicalInfoVo parseData(HtmlPage page) throws IllegalArgumentException, IllegalAccessException {
		ChemicalInfoVo ci = new ChemicalInfoVo();
		Elements tdElements = page.getDoc().select("td[class=formname]");
		for (Element titleTd : tdElements) {
			String title = titleTd.select("label").first().attr("for"); // field name carried by the label
			Element valueTd = titleTd.nextElementSibling(); // the value lives in the next cell
			ci.setValue(title, valueTd.text());
		}
		ci.setCreateTime(new Date());
		return ci;
	}

	/**
	 * Reads the pagination bar of a list page and returns the largest page
	 * number found, or 0 if no numeric link exists.
	 *
	 * @param page list page containing the pagination element
	 * @return highest page number, 0 when none found
	 */
	private int getMaxPageNum(HtmlPage page) {
		Elements elements = page.getDoc().select("div[class=div_pagenavgation]").select("a");
		int maxPageNum = 0;
		for (Element element : elements) {
			String pageNum = element.text();
			// Only purely numeric, non-empty link texts are candidate page numbers.
			if (PAGE_NUM_PATTERN.matcher(pageNum).matches()) {
				maxPageNum = Math.max(maxPageNum, Integer.parseInt(pageNum));
			}
		}

		return maxPageNum;
	}

}
